Project One - Cities Rediscovering Themselves: The Aftermath of Local Law 18 in New York City's Airbnb Market Across the Boroughs¶
Step 1. Pre-Processing our data!¶
In [1]:
# In case you need to download these dependencies that are used in this notebook
# !pip install pandas==2.2.3 plotly==5.24.1 kaleido==0.1.0 seaborn==0.13.2 numpy==2.2.1 great-tables==0.15.0
In [2]:
# Got to load in our dependencies!
from datetime import datetime
import pandas as pd
import plotly.express as px
import plotly.offline as pyo
import seaborn as sns
import plotly.graph_objects as go
import numpy as np
from pandas.api.types import is_numeric_dtype
from great_tables import GT, md, html, system_fonts, style, loc
In [3]:
# First, we can load in our dataframes, starting with the overall look at all of the listings within the city and its general information for 2024
nov_listings: pd.DataFrame = pd.read_csv('./datasets/new_york_listings.csv')
# For historical reasons, let's also load in the same listings dataset but from July 2023
jul_23_listings: pd.DataFrame = pd.read_csv('./datasets/NYC-Airbnb-2023.csv')
C:\Users\ernie\AppData\Local\Temp\ipykernel_12408\4071149682.py:5: DtypeWarning: Columns (17) have mixed types. Specify dtype option on import or set low_memory=False.
In [4]:
nov_listings.head(3)
Out[4]:
| id | listing_url | scrape_id | last_scraped | source | name | description | neighborhood_overview | picture_url | host_id | ... | review_scores_communication | review_scores_location | review_scores_value | license | instant_bookable | calculated_host_listings_count | calculated_host_listings_count_entire_homes | calculated_host_listings_count_private_rooms | calculated_host_listings_count_shared_rooms | reviews_per_month | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2595 | https://www.airbnb.com/rooms/2595 | 20241104040953 | 2024-11-04 | city scrape | Skylit Midtown Castle Sanctuary | Beautiful, spacious skylit studio in the heart... | Centrally located in the heart of Manhattan ju... | https://a0.muscache.com/pictures/miso/Hosting-... | 2845 | ... | 4.8 | 4.81 | 4.40 | NaN | f | 3 | 3 | 0 | 0 | 0.27 |
| 1 | 6848 | https://www.airbnb.com/rooms/6848 | 20241104040953 | 2024-11-04 | city scrape | Only 2 stops to Manhattan studio | Comfortable studio apartment with super comfor... | NaN | https://a0.muscache.com/pictures/e4f031a7-f146... | 15991 | ... | 4.8 | 4.69 | 4.58 | NaN | f | 1 | 1 | 0 | 0 | 1.04 |
| 2 | 6872 | https://www.airbnb.com/rooms/6872 | 20241104040953 | 2024-11-04 | city scrape | Uptown Sanctuary w/ Private Bath (Month to Month) | This charming distancing-friendly month-to-mon... | This sweet Harlem sanctuary is a 10-20 minute ... | https://a0.muscache.com/pictures/miso/Hosting-... | 16104 | ... | 5.0 | 5.00 | 5.00 | NaN | f | 2 | 0 | 2 | 0 | 0.03 |
3 rows × 75 columns
In [5]:
jul_23_listings.head(3)
Out[5]:
| id | name | host_id | host_name | neighbourhood_group | neighbourhood | latitude | longitude | room_type | price | minimum_nights | number_of_reviews | last_review | reviews_per_month | calculated_host_listings_count | availability_365 | number_of_reviews_ltm | license | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2595 | Skylit Midtown Castle | 2845 | Jennifer | Manhattan | Midtown | 40.75356 | -73.98559 | Entire home/apt | 150 | 30 | 49 | 2022-06-21 | 0.30 | 3 | 314 | 1 | NaN |
| 1 | 5121 | BlissArtsSpace! | 7356 | Garon | Brooklyn | Bedford-Stuyvesant | 40.68535 | -73.95512 | Private room | 60 | 30 | 50 | 2019-12-02 | 0.30 | 2 | 365 | 0 | NaN |
| 2 | 5203 | Cozy Clean Guest Room - Family Apt | 7490 | MaryEllen | Manhattan | Upper West Side | 40.80380 | -73.96751 | Private room | 75 | 2 | 118 | 2017-07-21 | 0.72 | 1 | 0 | 0 | NaN |
In [6]:
# I like to do this because sometimes the columns have whitespace or weird capitalization you don't even realize
nov_listings.columns = np.vectorize(lambda x: x.strip().lower())(nov_listings.columns)
# Our first pre-processing step can just be dropping columns we definitely, 100% don't need for our analysis
print(f'nov_listings currently has {len(nov_listings.columns)} columns')
nov_listings.drop(
columns=['picture_url',
'host_url',
'neighbourhood', #Not really the neighborhood
'host_thumbnail_url',
'host_picture_url',
'host_has_profile_pic',
'host_identity_verified',
'license',
],
inplace = True
)
# Take the dollar sign out of the price column so we can do some analysis with it
nov_listings['price'] = nov_listings["price"].apply(
lambda x: float(x.replace('$', '').replace(',','') if isinstance(x, str) else x)
)
print(f'nov_listings currently has {len(nov_listings.columns)} columns')
nov_listings currently has 75 columns nov_listings currently has 67 columns
In [7]:
# Can do the same for the 2023 listings, although there is a lot less data
jul_23_listings.columns = np.vectorize(lambda x: x.strip().lower())(jul_23_listings.columns)
print(f'jul_23_listings currently has {len(jul_23_listings.columns)} columns')
jul_23_listings.drop(
columns=['license', 'number_of_reviews_ltm'],
inplace=True)
print(f'jul_23_listings currently has {len(jul_23_listings.columns)} columns')
jul_23_listings currently has 18 columns jul_23_listings currently has 16 columns
In [8]:
# Now, let's take a look at NaN values for each dataframe
print(f"""The number of NaN values per column in nov_listings: \n
{nov_listings.isna().sum().sort_values(ascending=False)[:11]}'
"""
)
print(f"""
'The number of NaN values per column in jul_23_listings: \n
{jul_23_listings.isna().sum().sort_values(ascending=False)}
"""
)
# That's helpful, but doesn't really give me a gauge on what percentage of rows this is, so let's calculate that!
nov_nan_percentages: pd.DataFrame = pd.DataFrame((nov_listings.isna().sum().sort_values(ascending=False) / len(nov_listings.index) * 100).round(2), columns=['Missing Value Percentage']).reset_index(names=['Column Name'])
nov_nan_table: GT = (
GT(nov_nan_percentages[:10])
.tab_header(
title = html(
"<span style='font-size:20px; font-weight:bold;'>Missing values for the November 2024 Table</span>"
),
subtitle = html(
"<span style='font-size:15px; font-weight:bold;'>Sorted by the Percentage of the Column Missing</span>"
),
)
.tab_options(
table_font_names=system_fonts("industrial")
)
.data_color(
columns=['Missing Value Percentage'],
palette = 'RdPu',
)
)
jul_nan_percentages: pd.DataFrame = pd.DataFrame(
(
jul_23_listings.isna().sum().sort_values(ascending=False)[:20]
/ len(jul_23_listings.index) * 100
).round(2),
columns=['Missing Value Percentage']).reset_index(names=['Column Name'])
jul_nan_table: GT = (
GT(jul_nan_percentages[:10])
.tab_header(
title = html(
"<span style='font-size:20px; font-weight:bold;'>Missing values for the July 2023 Table</span>"
),
subtitle = html(
"<span style='font-size:15px; font-weight:bold;'>Sorted by the Percentage of the Column Missing</span>"
),
)
.tab_options(
table_font_names=system_fonts("industrial"),
)
.data_color(
columns=['Missing Value Percentage'],
palette = 'RdPu',
)
)
# Save our tables to a file
nov_nan_table.save(file='../img/november_2024_nan_table.png', scale = 14)
jul_nan_table.save(file='../img/july_2023_nan_table.png', scale = 12)
nov_nan_table.show()
jul_nan_table.show()
The number of NaN values per column in nov_listings:
calendar_updated 37548
neighborhood_overview 16974
host_about 16224
host_response_rate 15001
host_response_time 15001
host_acceptance_rate 14983
beds 14952
bathrooms 14809
price 14807
review_scores_location 11578
review_scores_value 11577
dtype: int64'
'The number of NaN values per column in jul_23_listings:
last_review 10304
reviews_per_month 10304
name 12
host_name 5
neighbourhood_group 0
neighbourhood 0
id 0
host_id 0
longitude 0
latitude 0
room_type 0
price 0
number_of_reviews 0
minimum_nights 0
calculated_host_listings_count 0
availability_365 0
dtype: int64
| Missing values for the November 2024 Table | |
| Sorted by the Percentage of the Column Missing | |
| Column Name | Missing Value Percentage |
|---|---|
| calendar_updated | 100.0 |
| neighborhood_overview | 45.21 |
| host_about | 43.21 |
| host_response_rate | 39.95 |
| host_response_time | 39.95 |
| host_acceptance_rate | 39.9 |
| beds | 39.82 |
| bathrooms | 39.44 |
| price | 39.43 |
| review_scores_location | 30.84 |
| Missing values for the July 2023 Table | |
| Sorted by the Percentage of the Column Missing | |
| Column Name | Missing Value Percentage |
|---|---|
| last_review | 24.0 |
| reviews_per_month | 24.0 |
| name | 0.03 |
| host_name | 0.01 |
| neighbourhood_group | 0.0 |
| neighbourhood | 0.0 |
| id | 0.0 |
| host_id | 0.0 |
| longitude | 0.0 |
| latitude | 0.0 |
In [9]:
# First, let's get rid of that pesky calendar_updated column
nov_listings.drop(columns=['calendar_updated'], inplace=True)
# Let's get all the numerical columns with more than 30% of their values missing
missing_columns = [name for name, val in (nov_listings.isna().sum().sort_values(ascending=False) / len(nov_listings.index) * 100).items() if val > .3 and is_numeric_dtype(nov_listings[name])]
# Create a function that can replace values in a column based for each borough
def fill_na_with_group_means(df: pd.DataFrame, col: str, group_col: str = 'neighbourhood_group_cleansed') -> pd.Series:
""" Returns a dictionary with the median for the grouped column that can be used to fill NaN values
Args:
df (pd.DataFrame): dataframe to utilize
col (str): column to take the median of
group_col (str, optional): column to group by Defaults to 'neighbourhood_group_cleansed'.
Returns:
pd.Series: series with the indexes as the grouped_by indexes and the values as the medians of each group for the specified column
"""
# print(df.groupby(group_col)[col].transform('median'))
return df[col].fillna(df.groupby(group_col)[col].transform('median'))
# Do it for every missing column
for col in missing_columns:
nov_listings[col] = fill_na_with_group_means(nov_listings, col)
From here, we would typically do the same for jul_23_listings, but based upon the analysis above, there aren't many important columns for null values at all, so we can leave that.
Step 2. Visualizations¶
In [10]:
# Set plotly to offline mode so we can display these visualizations
pyo.init_notebook_mode()
In [11]:
# For our first visualization, it might be helpful to look at for each borough how the number of listings have changed from 2023 to now
num_nov_of_listings: pd.Series = nov_listings.groupby('neighbourhood_group_cleansed').size()
num_jul_23_listings: pd.Series = jul_23_listings.groupby('neighbourhood_group').size()
total_diff: float = ((len(nov_listings) - len(jul_23_listings)) / len(jul_23_listings)) * -100
listings_change: pd.DataFrame = pd.DataFrame({
'borough': num_nov_of_listings.index,
'July 2023': num_jul_23_listings,
'November 2024': num_nov_of_listings
})
listings_change['percent_change'] = ((listings_change['November 2024'] - listings_change['July 2023']) / listings_change['July 2023']) * 100
listings_change_fig = px.bar(
data_frame = listings_change,
x = 'borough',
template='plotly_dark',
y = ['July 2023', 'November 2024'],
barmode='group',
labels = {'borough': 'Borough', 'variable': 'Month'},
color_discrete_map={'July 2023': '#a1c9f4', 'November 2024': '#8de5a1'}
)
# Update layout
listings_change_fig.update_layout(
yaxis = dict(title=dict(text='Number of Airbnb Listings')),
font_family = "Raleway, sans-serif",
title = dict(text=f'<b>How Local Law 18 changed the number of Airbnbs across NYC boroughs</b><br><sup>The number across the city decreased by {total_diff:.2f}% but that decrease varied across boroughs</sup>'),
legend = dict (
x = .5,
y = -.3,
orientation = 'h',
yanchor = 'bottom',
xanchor = 'center'
)
)
# Add percent change information above each bar
for i, row in listings_change.iterrows():
listings_change_fig.add_trace(go.Scatter(
x=[row['borough']],
y = [max(row['July 2023'], row['November 2024']) + 15],
text=[f"{row['percent_change']:.2f}%"],
mode="text",
showlegend=False,
textfont = dict(weight=600, size = 15)
))
# Have to do this weird thing where we save the image and then display it within the notebook because plotly graphs mess up my blog lol
listings_change_fig.write_image("../img/listings_change.png", scale=6, engine="kaleido")

In [12]:
jul_listings_change_map = px.scatter_map(
data_frame = jul_23_listings,
lat = 'latitude',
lon = 'longitude',
color = 'neighbourhood_group',
map_style = 'carto-darkmatter',
labels = {'neighbourhood_group': 'Borough'},
opacity = .5,
size_max = 10,
title = 'Geographic Distribution of Airbnbs Across New York City: July 2023',
zoom = 9
)
jul_listings_change_map.update_layout(
font_family = "Raleway, sans-serif",
legend = dict (
x = .5,
y = -.3,
orientation = 'h',
yanchor = 'bottom',
xanchor = 'center'
)
)
nov_listings_change_map = px.scatter_map(
data_frame = nov_listings,
lat = 'latitude',
lon = 'longitude',
color = 'neighbourhood_group_cleansed',
labels = {'neighbourhood_group_cleansed': 'Borough'},
map_style = 'carto-darkmatter',
opacity = .5,
size_max = 10,
title = 'Geographic Distribution of Airbnbs Across New York City: November 2024',
zoom = 9
)
nov_listings_change_map.update_layout(
font_family = "Raleway, sans-serif",
legend = dict (
x = .5,
y = -.3,
orientation = 'h',
yanchor = 'bottom',
xanchor = 'center'
)
)
# Save these maps to html so we can display them on the website
jul_listings_change_map.write_html('../plotly/jul_2023_airbnb_map.html')
nov_listings_change_map.write_html('../plotly/nov_2024_airbnb_map.html')
# Save these maps to pngs so we can display them within the notebook
jul_listings_change_map.write_image('../img/jul_listings_change_map.png', scale = 6, engine = 'kaleido')
nov_listings_change_map.write_image('../img/nov_listings_change_map.png', scale = 6, engine = 'kaleido')
# jul_listings_change_map.show()
# nov_listings_change_map.show()


That's some pretty cool insight, but the number of boroughs doesn't simply tell the entire story. How about the average prices? Let's explore that!
In [13]:
avg_price_nov_listings: pd.Series = nov_listings.groupby('neighbourhood_group_cleansed')['price'].mean()
avg_price_jul_23_listings: pd.Series = jul_23_listings.groupby('neighbourhood_group')['price'].mean()
avg_price_change: pd.DataFrame = pd.DataFrame({
'borough': avg_price_nov_listings.index,
'July 2023': avg_price_jul_23_listings,
'November 2024': avg_price_nov_listings,
})
avg_price_change['percent_change'] = ((avg_price_change['November 2024'] - avg_price_change['July 2023']) / avg_price_change['July 2023']) * 100
avg_price_fig = px.bar(
data_frame = avg_price_change,
x = 'borough',
y = ['July 2023', 'November 2024'],
barmode = 'group',
template = 'plotly_dark',
labels = {'borough': 'Borough', 'variable': 'Month'},
color_discrete_map={'July 2023': '#d0bbff', 'November 2024': '#fab0e4'}
)
# Update layout
avg_price_fig.update_layout(
yaxis = dict(title=dict(text='Average Price of Airbnb Listing ($)')),
font_family = "Raleway, sans-serif",
title = dict(text=f'<b>How Local Law 18 changed the price of Airbnbs across NYC boroughs</b><br><sup>Across the board, the prices of Airbnbs decreased despite the stark decrease in options available</sup>'),
legend = dict (
x = .5,
y = -.3,
orientation = 'h',
yanchor = 'bottom',
xanchor = 'center'
)
)
# Add percent change information above each bar
for i, row in avg_price_change.iterrows():
avg_price_fig.add_trace(go.Scatter(
x=[row['borough']],
y = [row['July 2023'] + 10],
text=[f"{row['percent_change']:.2f}%"],
mode="text",
showlegend=False,
textfont = dict(weight=600,size = 15)
))
avg_price_fig.write_image("../img/avg_price_change.png", scale=6, engine="kaleido")

In [14]:
# We will load a dataset from 2020 to help us drive this analysis
sep_2020_listings: pd.DataFrame = pd.read_csv('./datasets/sep_2020_listings.csv')
sep_2020_listings.head(5)
Out[14]:
| id | name | summary | description | experiences_offered | neighborhood_overview | transit | house_rules | host_id | host_since | ... | hot_tub_sauna_or_pool | internet | long_term_stays | pets_allowed | private_entrance | secure | self_check_in | smoking_allowed | accessible | event_suitable | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2539 | Clean & quiet apt home by the park | Renovated apt home in elevator building. | Renovated apt home in elevator building. Spaci... | none | Close to Prospect Park and Historic Ditmas Park | Very close to F and G trains and Express bus i... | -The security and comfort of all our guests is... | 2787 | 39698.0 | ... | -1 | 1 | 1 | -1 | -1 | 1 | 1 | -1 | 1 | 1 |
| 1 | 3647 | THE VILLAGE OF HARLEM....NEW YORK ! | NaN | WELCOME TO OUR INTERNATIONAL URBAN COMMUNITY T... | none | NaN | NaN | Upon arrival please have a legibile copy of yo... | 4632 | 39777.0 | ... | -1 | 1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 |
| 2 | 7750 | Huge 2 BR Upper East Cental Park | NaN | Large Furnished 2BR one block to Central Park... | none | NaN | NaN | NaN | 17985 | 39953.0 | ... | -1 | 1 | -1 | 1 | -1 | -1 | -1 | -1 | -1 | -1 |
| 3 | 8505 | Sunny Bedroom Across Prospect Park | Just renovated sun drenched bedroom in a quiet... | Just renovated sun drenched bedroom in a quiet... | none | Quiet and beautiful Windsor Terrace. The apart... | Ten minutes walk to the 15th sheet F&G train s... | - No shoes in the house - Quiet hours after 11... | 25326 | 40006.0 | ... | -1 | 1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 |
| 4 | 8700 | Magnifique Suite au N de Manhattan - vue Cloitres | Suite de 20 m2 a 5 min des 2 lignes de metro a... | Suite de 20 m2 a 5 min des 2 lignes de metro a... | none | NaN | Metro 1 et A | NaN | 26394 | 40014.0 | ... | -1 | 1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 |
5 rows × 81 columns
In [15]:
def convert_excel_serial_to_date(col: pd.Series) -> pd.Series:
"""
Convert an Excel-style serial date to a readable date format.
Args:
col (pd.Series): Numeric value representing a date in Excel format.
Returns:
pd.Series: Corresponding date in YYYY-MM-DD format.
"""
try:
return pd.to_datetime(col, origin='1899-12-30', unit='D')
except:
return None # Return None for invalid values
In [16]:
# Convert the host_since column here to an actual date (for some reason it's in some weird Excel format)
sep_2020_listings['host_since'] = sep_2020_listings['host_since'].apply(convert_excel_serial_to_date)
sep_2020_listings['host_since']
Out[16]:
0 2008-09-07
1 2008-11-25
2 2009-05-20
3 2009-07-12
4 2009-07-20
...
30174 2016-12-18
30175 2013-08-18
30176 2013-05-25
30177 2015-04-10
30178 2016-04-20
Name: host_since, Length: 30179, dtype: datetime64[ns] In [17]:
def calculate_age_of_host(col: pd.Series, year: int, month: int = 1) -> pd.Series:
"""
Calculate the amount of time has elapsed in years for a given column
Args:
col (pd.Series): Series of the dates (in datetime format)
Year (int): Year to calculate the date from
month (int, optional): Month to calculate the datetime with. Defaults to 1.
Returns:
pd.Series: series with the indexes as the grouped_by indexes and the values as the medians of each group for the specified column
"""
# Convert the year in which the data was collected to datetime
reference_date = datetime(year, month, 1)
# Make sure the column given is in datetime
col = pd.to_datetime(col)
# Calculate the amount of days that have elapsed
elapsed_time: pd.Series = reference_date - col
elapsed_years: pd.Series = elapsed_time.dt.days / 365.25
return elapsed_years
In [18]:
# Get the hosting age for November 2024
nov_listings['hosting_age'] = calculate_age_of_host(nov_listings['host_since'].dropna(), 2024, 11)
nov_average_age: np.float64 = np.average(nov_listings['hosting_age'].dropna())
# Get the hosting age for September 2020
sep_2020_listings['hosting_age'] = calculate_age_of_host(sep_2020_listings['host_since'].dropna(), 2020, 9)
sep_2020_average_age: np.float64 = np.average(sep_2020_listings['hosting_age'].dropna())
In [19]:
NUM_OF_BINS = 25
RAINBOW_COLORS = ["#ffadad","#ffd6a5","#fdffb6","#caffbf","#9bf6ff","#a0c4ff","#bdb2ff","#ffc6ff"]
RAINBOW_COLOR_SEQ = [RAINBOW_COLORS[i % len(RAINBOW_COLORS)] for i in range(NUM_OF_BINS)]
nov_listings_hosting_age = px.histogram(
data_frame = nov_listings,
x = 'hosting_age',
nbins = NUM_OF_BINS,
template = 'plotly_dark',
labels = {'count': 'Number of Airbnbs', 'hosting_age': 'Age of Airbnb (years)'},
)
nov_listings_hosting_age.update_layout(
font_family = "Raleway, sans-serif",
title = dict(text=f'<b>Hosting Age of Airbnb Hosts in November 2024</b><br><sup>In November 2024, the average age of a listing in New York City was {nov_average_age:.2f}</sup>'),
legend = dict (
x = .5,
y = -.3,
orientation = 'h',
yanchor = 'bottom',
xanchor = 'center'
)
)
for bar in nov_listings_hosting_age.data:
bar.marker.color = RAINBOW_COLOR_SEQ
# nov_listings_hosting_age.show()

In [20]:
sep_2020_listings_hosting_age = px.histogram(
data_frame = sep_2020_listings,
x = 'hosting_age',
nbins = NUM_OF_BINS,
template = 'plotly_dark',
labels = {'count': 'Number of Airbnbs', 'hosting_age': 'Age of Airbnb (years)'},
)
sep_2020_listings_hosting_age.update_layout(
font_family = "Raleway, sans-serif",
title = dict(text=f'<b>Hosting Age of Airbnb Hosts in September 2020</b><br><sup>In September 2020, the average age of a listing in New York City was {sep_2020_average_age:.2f}</sup>'),
legend = dict (
x = .5,
y = -.3,
orientation = 'h',
yanchor = 'bottom',
xanchor = 'center'
)
)
for bar in sep_2020_listings_hosting_age.data:
bar.marker.color = RAINBOW_COLOR_SEQ
# sep_2020_listings_hosting_age

In [21]:
# Export these histograms to images
nov_listings_hosting_age.write_image('../img/nov_host_age.png', scale = 6, engine = 'kaleido')
sep_2020_listings_hosting_age.write_image('../img/sep_2020_host_age.png', scale = 6, engine = 'kaleido')